A descriptive analysis of company website data¶

In [49]:
## Importing libraries for the work
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [6]:
## Importing the dataframe into Pandas
amazon = pd.read_excel(r"C:\Users\User Pc\Desktop\Datasets\Online sales Power Querry  & R\Amazon store prepared data.xlsx")
amazon.head(3)
Out[6]:
Browser ID name main_category sub_category image link ratings no_of_ratings discount_price actual_price
0 0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2255.0 32,999 58,990
1 1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2948.0 46,490 75,990
2 2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1206.0 34,490 61,990
In [10]:
## Getting a general overview of all aspects of the data
amazon.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551585 entries, 0 to 551584
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Browser ID      551585 non-null  int64  
 1   name            551585 non-null  object 
 2   main_category   551585 non-null  object 
 3   sub_category    551585 non-null  object 
 4   image           551585 non-null  object 
 5   link            551585 non-null  object 
 6   ratings         369558 non-null  float64
 7   no_of_ratings   369558 non-null  float64
 8   discount_price  490422 non-null  object 
 9   actual_price    533772 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 42.1+ MB
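As a quick companion to the null counts reported by `.info()` above, the share of missing values per column can be computed directly with `isna().mean()`. A minimal sketch on a hypothetical toy frame (not the Amazon dataset itself):

```python
import pandas as pd
import numpy as np

# Toy frame standing in for the real data: two columns with some missing entries
toy = pd.DataFrame({
    "ratings": [4.2, np.nan, 3.9, np.nan],
    "actual_price": ["58,990", "75,990", None, "61,990"],
})

# isna() gives a boolean mask; its mean per column is the fraction missing
missing_share = toy.isna().mean()
```

On the real frame, the same call would show roughly a third of `ratings` and `no_of_ratings` missing, consistent with the 369,558 non-null counts above.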
In [11]:
## Getting an overview of the numeric aspects of the data
## The site had 551,585 browser-attempts
amazon.describe().transpose()
Out[11]:
count mean std min 25% 50% 75% max
Browser ID 551585.0 7006.200471 5740.835523 0.0 1550.0 5933.0 11482.0 19199.0
ratings 369558.0 3.832311 0.756102 1.0 3.5 3.9 4.3 5.0
no_of_ratings 369558.0 840.778698 8651.536018 1.0 4.0 20.0 133.0 589547.0
In [103]:
## Total no of browser-attempts on the site
amazon['Browser ID'].count()
Out[103]:
551585
In [16]:
## Checking whether the Browser ID column contains duplicates; it does (is_unique returns False)
amazon['Browser ID'].is_unique
Out[16]:
False
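An equivalent check, which also counts how many rows are repeats, uses `duplicated()`. A minimal sketch on a hypothetical toy series standing in for `amazon['Browser ID']`:

```python
import pandas as pd

# Hypothetical toy IDs: 1 appears twice, 3 appears three times
ids = pd.Series([0, 1, 1, 2, 3, 3, 3])

# is_unique is False whenever any value repeats
has_duplicates = not ids.is_unique

# duplicated() flags every occurrence after the first, so summing it
# counts the repeated rows
n_duplicate_rows = ids.duplicated().sum()
```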
In [20]:
## The distinct frequency levels at which Browser IDs repeat in the data
amazon.pivot_table(index=['Browser ID'], aggfunc='size').unique()
Out[20]:
array([112, 109, 108, 107, 106, 105, 104, 102, 101, 100,  99,  98,  95,
        93,  92,  91,  89,  86,  84,  78,  77,  74,  73,  70,  61,  60,
        57,  55,  54,  50,  49,  47,  45,  44,  43,  41,  39,  37,  36,
        33,  31,  30,  29,  28,  25,  18,  16,  11,  10,   6], dtype=int64)
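The `pivot_table(..., aggfunc='size')` idiom above is equivalent to `value_counts()`, which may read more directly. A sketch on hypothetical toy data:

```python
import pandas as pd

# Toy IDs: 0 occurs three times, 1 twice, 2 once
ids = pd.Series([0, 0, 0, 1, 1, 2])

# Occurrences per Browser ID, same result as
# pivot_table(index=['Browser ID'], aggfunc='size')
freq = ids.value_counts()

# The distinct frequency levels, analogous to the array shown above
levels = sorted(freq.unique(), reverse=True)
```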
In [26]:
## Total number of unique browsers on the site
len(amazon.pivot_table(index=['Browser ID'], aggfunc='size'))
Out[26]:
19200
In [64]:
## Main categories by their traffic levels on the site
main_category_traffic_levels = amazon.groupby('main_category')['Browser ID'].count().sort_values(ascending=False)
main_category_traffic_levels[:10]
Out[64]:
main_category
accessories            116141
men's clothing          76656
women's clothing        76512
tv, audio & cameras     68659
men's shoes             57456
appliances              33096
stores                  32903
home & kitchen          14568
kids' fashion           13488
sports & fitness        12648
Name: Browser ID, dtype: int64
In [112]:
## Visual rep. of main categories by their respective traffic levels
main_category_traffic_levels[:10].plot(kind='bar', color='brown')
Out[112]:
<Axes: xlabel='main_category'>
In [104]:
## Main categories by their unique customer attempts on the site
main_category_by_unique_browserID = amazon.groupby('main_category')['Browser ID'].nunique().sort_values(ascending=False)
main_category_by_unique_browserID
Out[104]:
main_category
men's clothing             19200
men's shoes                19200
women's clothing           19200
stores                     19200
accessories                19152
appliances                  9600
tv, audio & cameras         9600
kids' fashion               2736
beauty & health             2640
bags & luggage              2208
women's shoes               2208
industrial supplies         1440
home & kitchen              1416
car & motorbike             1392
toys & baby products        1392
grocery & gourmet foods     1296
sports & fitness            1296
music                       1080
pet supplies                 984
home, kitchen, pets           24
Name: Browser ID, dtype: int64
In [116]:
## Visual rep. of Main category by their unique customer attempts
main_category_by_unique_browserID[:10].plot(kind='bar', grid=True, color='green')
Out[116]:
<Axes: xlabel='main_category'>
In [123]:
## Main categories with the highest total (summed) product ratings on the site
main_category_ratings= amazon.groupby('main_category', 
                       as_index=False)['ratings'].sum().sort_values(by='ratings', 
                        ascending=False)[:10]
main_category_ratings
Out[123]:
main_category ratings
0 accessories 263218.9
18 women's clothing 241475.7
17 tv, audio & cameras 189407.5
10 men's clothing 158233.0
15 stores 111927.4
11 men's shoes 105521.0
1 appliances 105073.7
6 home & kitchen 52804.2
14 sports & fitness 33591.4
3 beauty & health 30908.1
In [126]:
## Viz of main_category by their total customer ratings of the products on the site
main_category_ratings.plot(kind='bar', x='main_category', y='ratings', color='purple')
Out[126]:
<Axes: xlabel='main_category'>

A diagnostic analysis of the reasons for our results¶

For this, we'll use a correlation table to analyze the relationship between each pair of variables in the dataframe¶

In [133]:
## Preparing the data for the process (assignment instead of inplace=True,
## which is deprecated for chained calls in recent pandas)
amazon['actual_price'] = amazon['actual_price'].fillna(0)
In [134]:
## Contd data preparation for the correlation analysis
amazon['discount_price'] = amazon['discount_price'].fillna(0)
In [148]:
## Contd data preparation: strip currency symbols and thousands separators,
## then cast to float (raw string avoids an invalid-escape warning)
amazon['discount_price'] = amazon['discount_price'].replace(r'[^\d.]', '', regex=True).astype(float)
In [150]:
## Contd data preparation for the correlation analysis
amazon['actual_price'] = amazon['actual_price'].replace(r'[^\d.]', '', regex=True).astype(float)
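The cleaning step above can be checked in isolation on a few hypothetical price strings in the same "32,999" format, including a 0 standing in for a filled missing value (the regex only touches string entries, so numeric fills pass through):

```python
import pandas as pd

# Hypothetical price column: comma-separated string, currency-prefixed string,
# and a numeric 0 from the earlier fillna step
prices = pd.Series(["32,999", "₹46,490", 0])

# Keep only digits and the decimal point, then cast everything to float
cleaned = prices.replace(r"[^\d.]", "", regex=True).astype(float)
```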
In [165]:
## The correlation data to analyze variable relationships
corr_df = amazon.drop(['Browser ID', 'name', 'main_category', 'sub_category', 'image', 'link'], axis=1).corr(method='pearson')
In [170]:
## Correlation matrix
corr_df.style.background_gradient(cmap='coolwarm')
Out[170]:
  ratings no_of_ratings discount_price actual_price
ratings 1.000000 0.040689 0.055439 0.076250
no_of_ratings 0.040689 1.000000 0.014449 0.017019
discount_price 0.055439 0.014449 1.000000 0.848990
actual_price 0.076250 0.017019 0.848990 1.000000
In [ ]:
## As seen above, the result shows that

## ratings and no_of_ratings have only a weak positive relationship with discount and actual price
## (all coefficients below 0.08), meaning price tells us very little about how a product is rated

## There is, however, a strong positive relationship between discount price and actual price (0.85), as expected

## meaning: the higher the actual price, the higher the discounted price on specific products